# scripts/train_tokenizer.py
import os
import sentencepiece as spm

# ========================
# Configuration
# ========================
DATA_DIR = "../data"
TOKENIZER_DIR = "../tokenizer"
os.makedirs(TOKENIZER_DIR, exist_ok=True)

INPUT_TEXT = os.path.join(DATA_DIR, "pretrain_corpus.txt")
MODEL_PREFIX = os.path.join(TOKENIZER_DIR, "pretrain_corpus_sp")
VOCAB_SIZE = 32000

# ========================
# Train a SentencePiece BPE tokenizer
# ========================
spm.SentencePieceTrainer.train(
    input=INPUT_TEXT,                  # path to the training corpus
    model_prefix=MODEL_PREFIX,         # prefix for the output .model/.vocab files
    vocab_size=VOCAB_SIZE,             # vocabulary size
    model_type='bpe',                  # segmentation algorithm: BPE
    character_coverage=1.0,            # cover all characters (suitable for a mixed Chinese/English corpus)
    bos_id=-1,                         # disable the beginning-of-sentence token (not added to the vocab)
    unk_id=0,                          # ID of the unknown token
    eos_id=1,                          # ID of the end-of-sentence token
    pad_id=2,                          # ID of the padding token
)
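
# ========================
# Quick sanity check (a minimal sketch, assuming training finished and wrote
# MODEL_PREFIX + ".model"): load the trained tokenizer and encode a sample
# sentence. The sample text below is only an illustrative assumption.
# ========================
sp = spm.SentencePieceProcessor(model_file=MODEL_PREFIX + ".model")
print("vocab size:", sp.get_piece_size())

sample = "Hello world, 你好，世界。"  # hypothetical sample sentence
print("pieces:", sp.encode(sample, out_type=str))  # subword pieces
print("ids:   ", sp.encode(sample, out_type=int))  # token IDs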
